To widen Open Context's interoperability with other scientific information systems, we are starting to cross-reference the biological taxonomy categories published by Open Context with GBIF (Global Biodiversity Information Facility, https://gbif.org) identifiers.
To start this process, this Jupyter notebook finds GBIF identifiers that correspond with EOL (Encyclopedia of Life, https://eol.org) identifiers already used by Open Context.
The datasets used and created by this notebook are stored in the /files/eol directory. The files used and created by this notebook include:

- eol-gbif.csv.gz (The source of the data is https://opendata.eol.org/dataset/identifier-map, dated 2019-12-20. The data is filtered to include only records where the resource_id is 767, which corresponds to GBIF; a sketch of this filtering step follows the list.)
- oc-eol-uris.csv (This is a CSV dump, current as of 2020-01-15, from the Open Context link_entities model of records where URIs started with 'http://eol.org'. It represents all of the EOL entities that Open Context uses to cross-reference project-specific biological taxonomic concepts.)
- oc-eol-gbif-with-missing.csv (This is the scratch, working data file that joins oc-eol-uris.csv data with records from eol-gbif.csv.gz. Execution of this notebook creates this file and periodically updates it with names and new IDs resulting from requests to the GBIF API.)
- oc-eol-gbif.csv (This notebook generates this file, which describes equivalences between the EOL items used by Open Context and corresponding GBIF identifiers.)
- oc-eol-no-gbif.csv (This notebook generates this file, which describes EOL items used by Open Context that lack corresponding GBIF identifiers. These records will probably need manual curation.)
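For reference, here is a minimal sketch of how eol-gbif.csv.gz could be derived from the EOL identifier map download. The input filename identifier_map.csv.gz is an assumption; only the resource_id 767 filter comes from the description above.

In [ ]:
import pandas as pd

# Hypothetical reproduction of the eol-gbif.csv.gz filtering step.
# Assumes the EOL identifier map was downloaded as
# 'identifier_map.csv.gz' and includes a resource_id column.
df_map = pd.read_csv('identifier_map.csv.gz')

# Keep only the records where resource_id is 767 (GBIF).
df_gbif_map = df_map[df_map['resource_id'] == 767].copy()
df_gbif_map.to_csv('eol-gbif.csv.gz', index=False, compression='gzip')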
In [1]:
import json
import os
import requests
from time import sleep
import numpy as np
import pandas as pd
# Get the root_path for this jupyter notebook repo.
repo_path = os.path.dirname(os.path.abspath(os.getcwd()))
# Path for the (gzip compressed) CSV data dump from EOL
# with GBIF names and EOL IDs.
eol_gbif_names_path = os.path.join(
    repo_path, 'files', 'eol', 'eol-gbif.csv.gz'
)
# Path for the CSV data from Open Context of all EOL
# URIs and IDs currently referenced by Open Context.
oc_eol_path = os.path.join(
    repo_path, 'files', 'eol', 'oc-eol-uris.csv'
)
# Path for the CSV data that has EOL URIs used by Open Context
# with GBIF URIs and missing GBIF URIs.
oc_eol_gbif_w_missing_path = os.path.join(
    repo_path, 'files', 'eol', 'oc-eol-gbif-with-missing.csv'
)
# Path for CSV data that has EOL URIs used by Open Context and
# corresponding GBIF URIs and names.
oc_eol_gbif_path = os.path.join(
    repo_path, 'files', 'eol', 'oc-eol-gbif.csv'
)
# Path for CSV data that has EOL URIs used by Open Context
# but no corresponding GBIF URIs.
oc_eol_no_gbif_path = os.path.join(
    repo_path, 'files', 'eol', 'oc-eol-no-gbif.csv'
)
Now define some functions that we'll use repeatedly.
In [2]:
def save_result_files(
    df,
    path_with_gbif=oc_eol_gbif_path,
    path_without_gbif=oc_eol_no_gbif_path
):
    """Saves files for outputs with and without GBIF ids"""
    # Save the interim results with matches.
    gbif_index = ~df['gbif_id'].isnull()
    df_ok_gbif = df[gbif_index].copy().reset_index(drop=True)
    print('Saving EOL matches with GBIF...')
    df_ok_gbif.to_csv(path_with_gbif, index=False)
    no_gbif_index = df['gbif_id'].isnull()
    df_no_gbif = df[no_gbif_index].copy().reset_index(drop=True)
    print('Saving EOL records without GBIF matches...')
    df_no_gbif.to_csv(path_without_gbif, index=False)


def get_gbif_canonical_name(gbif_id, sleep_secs=0.25):
    """Get the canonical name from the GBIF API for an ID"""
    sleep(sleep_secs)
    url = 'https://api.gbif.org/v1/species/{}'.format(gbif_id)
    print('Get URL: {}'.format(url))
    r = requests.get(url)
    r.raise_for_status()
    json_r = r.json()
    return json_r.get('canonicalName')


def get_gbif_vernacular_name(gbif_id, lang_code='eng', sleep_secs=0.25):
    """Get the first vernacular name from the GBIF API for an ID"""
    sleep(sleep_secs)
    url = 'https://api.gbif.org/v1/species/{}/vernacularNames'.format(
        gbif_id
    )
    print('Get URL: {}'.format(url))
    r = requests.get(url)
    r.raise_for_status()
    json_r = r.json()
    vern_name = None
    for result in json_r.get('results', []):
        if result.get('language') != lang_code:
            continue
        vern_name = result.get('vernacularName')
        if vern_name is not None:
            break
    return vern_name


def add_names_to_gbif_ids(
    df,
    limit_by_method=None,
    save_path=oc_eol_gbif_w_missing_path
):
    """Adds names to GBIF ids where those names are missing"""
    gbif_index = ~df['gbif_id'].isnull()
    df.loc[gbif_index, 'gbif_uri'] = df[gbif_index]['gbif_id'].apply(
        lambda x: 'https://www.gbif.org/species/{}'.format(int(x))
    )
    df.to_csv(save_path, index=False)
    # Now use the GBIF API to fetch canonical names for GBIF items
    # where we do not yet have those names.
    need_can_name_index = (df['gbif_can_name'].isnull() & gbif_index)
    if limit_by_method:
        need_can_name_index &= (df['gbif_rel_method'] == limit_by_method)
    df.loc[need_can_name_index, 'gbif_can_name'] = df[need_can_name_index]['gbif_id'].apply(
        lambda x: get_gbif_canonical_name(int(x))
    )
    df.to_csv(save_path, index=False)
    # Now use the GBIF API to fetch vernacular names for GBIF items
    # where we do not yet have those names.
    need_vern_name_index = (df['gbif_vern_name'].isnull() & gbif_index)
    if limit_by_method:
        need_vern_name_index &= (df['gbif_rel_method'] == limit_by_method)
    df.loc[need_vern_name_index, 'gbif_vern_name'] = df[need_vern_name_index]['gbif_id'].apply(
        lambda x: get_gbif_vernacular_name(int(x))
    )
    df.to_csv(save_path, index=False)
    return df


def get_gbif_id_by_name(name, sleep_secs=0.25, allow_alts=False):
    """Get a GBIF ID by searching for a name via the GBIF API"""
    sleep(sleep_secs)
    if ' ' in name:
        # Only use the first 2 parts of a name with a space.
        name_sp = name.split(' ')
        if len(name_sp[0]) <= 2 or len(name_sp[1]) <= 2:
            return np.nan
        # Joining with a '+' also handles URL encoding of the space.
        name = name_sp[0] + '+' + name_sp[1]
    url = 'https://api.gbif.org/v1/species/match?verbose=true&dataset_key=d7dddbf4-2cf0-4f39-9b2a-bb099caae36c'
    url += '&name={}'.format(name)
    print('Get URL: {}'.format(url))
    r = requests.get(url)
    r.raise_for_status()
    json_r = r.json()
    gbif_id = json_r.get('usageKey')
    if gbif_id is not None:
        return int(gbif_id)
    elif not allow_alts:
        # We don't have an ID, and we're not allowing alternatives.
        return np.nan
    # Below is for multiple equal matches (allow_alts is True here).
    if json_r.get('matchType') != 'NONE':
        # We don't have an exact match.
        return np.nan
    alts = json_r.get('alternatives', [])
    if len(alts) == 0:
        # We don't have alternatives.
        return np.nan
    # Choose the first alternative.
    gbif_id = alts[0].get('usageKey')
    if not gbif_id:
        return np.nan
    return int(gbif_id)
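Before running the pipeline, it can help to smoke test these helpers against a well-known taxon. This check is illustrative and assumes network access; in the GBIF backbone, taxon 212 is Aves (birds).

In [ ]:
# Illustrative smoke test of the helper functions (not part of the
# pipeline). GBIF backbone taxon 212 is Aves (birds).
print(get_gbif_canonical_name(212))    # expected: Aves
print(get_gbif_vernacular_name(212))   # first 'eng' vernacular name, e.g. Birds
print(get_gbif_id_by_name('Aves'))     # expected: 212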
In [3]:
if not os.path.isfile(oc_eol_gbif_w_missing_path):
    # We don't have the oc-eol-gbif-with-missing data,
    # so we need to make it.
    df_eol_gbif_names = pd.read_csv(eol_gbif_names_path)
    df_oc_eol = pd.read_csv(oc_eol_path, encoding='utf-8')
    df_oc_eol.rename(columns={'id': 'page_id'}, inplace=True)
    df = df_oc_eol.merge(df_eol_gbif_names, on=['page_id'], how='left')
    print('We have {} rows of EOL uris in OC to relate to GBIF'.format(
        len(df.index)
    ))
    df.sort_values(by=['page_id'], inplace=True)
    # Now pull out the GBIF integer ID.
    df['gbif_id'] = pd.to_numeric(
        df['resource_pk'],
        errors='coerce',
        downcast='integer'
    )
    df['gbif_rel_method'] = np.nan
    df['gbif_uri'] = np.nan
    df['gbif_can_name'] = np.nan
    df['gbif_vern_name'] = np.nan
    # Now note that the rows where the gbif_id is not null
    # come from the EOL-GBIF names dataset.
    gbif_index = ~df['gbif_id'].isnull()
    df.loc[gbif_index, 'gbif_rel_method'] = 'EOL-GBIF-names'
    df.to_csv(oc_eol_gbif_w_missing_path, index=False)
In [4]:
# Load our working dataframe, now that we know
# it has been created.
df = pd.read_csv(oc_eol_gbif_w_missing_path)
Now that we have a main working dataset, we need to add canonical and vernacular names to the rows with GBIF IDs.
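For context, get_gbif_vernacular_name assumes the vernacularNames endpoint returns a JSON object with a results list of name records. Here is a trimmed example of that shape (the values are hypothetical):

In [ ]:
# Hypothetical, trimmed response from
# https://api.gbif.org/v1/species/{id}/vernacularNames
# showing the shape that get_gbif_vernacular_name() parses.
example_response = {
    'results': [
        {'vernacularName': 'oiseaux', 'language': 'fra'},
        {'vernacularName': 'Birds', 'language': 'eng'},
    ],
    'endOfRecords': True,
}
# The parser skips non-matching languages and returns the first
# 'eng' entry: 'Birds'.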
In [5]:
# Use GBIF API calls to add names to records with GBIF IDs but currently
# missing names.
df = add_names_to_gbif_ids(df, save_path=oc_eol_gbif_w_missing_path)
Now that we have added GBIF names to rows that have GBIF IDs, we will save our interim results.
In [6]:
# Save the Open Context EOL URIs with clear GBIF matches,
# as well as a file without matches
save_result_files(df)
At this point, we will still be missing GBIF IDs for many EOL records. So now, we will use the GBIF search API to find related GBIF IDs.
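For context, get_gbif_id_by_name reads three fields from the species match response: usageKey, matchType, and alternatives. A trimmed, hypothetical example of an ambiguous response (all values are made up) shows the case that allow_alts=True handles:

In [ ]:
# Hypothetical, trimmed response from
# https://api.gbif.org/v1/species/match for an ambiguous name.
# There is no top-level 'usageKey', matchType is 'NONE', and
# alternatives hold the candidate matches.
example_match = {
    'matchType': 'NONE',
    'note': 'Multiple equal matches',
    'alternatives': [
        {'usageKey': 1234567, 'scientificName': 'Example genus A'},
        {'usageKey': 7654321, 'scientificName': 'Example genus B'},
    ],
}
# With allow_alts=True, get_gbif_id_by_name() returns the first
# alternative's usageKey (1234567); otherwise it returns np.nan.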
In [7]:
# Now try to look up GBIF items where we don't have
# clear matches.
look_ups = [
    # Tuples are:
    # (field_for_name, allow_alts, gbif_rel_method,),
    ('preferred_canonical_for_page', False, 'EOL-pref-page-GBIF-exact-search',),
    ('preferred_canonical_for_page', True, 'EOL-pref-page-GBIF-search-w-alts',),
    ('label', False, 'EOL-OC-label-GBIF-exact-search',),
    ('label', True, 'EOL-OC-label-GBIF-search-w-alts',),
]
# Now iterate through these look_up configs.
for field_for_name, allow_alts, gbif_rel_method in look_ups:
    gbif_index = ~df['gbif_id'].isnull()
    ok_eol = df[gbif_index]['uri'].unique().tolist()
    no_gbif_index = (df['gbif_id'].isnull() & ~df['uri'].isin(ok_eol))
    # Get the index where there's a name in the field_for_name column,
    # but where we have no GBIF ID yet.
    no_gbif_index_w_name = (~df[field_for_name].isnull() & no_gbif_index)
    # Use the GBIF API to look up GBIF IDs.
    df.loc[no_gbif_index_w_name, 'gbif_id'] = df[no_gbif_index_w_name][field_for_name].apply(
        lambda x: get_gbif_id_by_name(x, allow_alts=allow_alts)
    )
    # The new GBIF IDs will have a gbif_rel_method of null. Make sure that
    # we record the gbif_rel_method at this point.
    new_gbif_id_index = (~df['gbif_id'].isnull() & df['gbif_rel_method'].isnull())
    df.loc[new_gbif_id_index, 'gbif_rel_method'] = gbif_rel_method
    # Save the interim results.
    df.to_csv(oc_eol_gbif_w_missing_path, index=False)
    # Now add names to the rows where we just found new IDs.
    df = add_names_to_gbif_ids(
        df,
        limit_by_method=gbif_rel_method,
        save_path=oc_eol_gbif_w_missing_path
    )
    # Save the interim results, again.
    df.to_csv(oc_eol_gbif_w_missing_path, index=False)
    # Save the interim results with matches to a file
    # and without matches to another file.
    save_result_files(df)
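Finally, as an optional sanity check (not part of the pipeline above), we can tally how the matches were made and how many EOL records still lack a GBIF ID:

In [ ]:
# Optional sanity check: count rows by the method used to relate
# them to GBIF, and count rows still lacking a GBIF ID.
print(df['gbif_rel_method'].value_counts(dropna=False))
print('Rows still missing a GBIF ID: {}'.format(
    df['gbif_id'].isnull().sum()
))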